rm(list=ls(all=TRUE))

library(sampleSelection) # Will use for extension, but not Heckman probit
?selection
?heckit

library(foreign)
getwd()
setwd("C:/Users/Kathryn/Downloads")
Henne = read.dta("Henne_2012/Henne.dta")

head(Henne)

Hen = data.frame(ccode1=Henne$ccode1,
                 ccode2=Henne$ccode2,
                 country1=Henne$country1,
                 country2=Henne$country2,
                 alliance=Henne$alliance,
                 contig=Henne$contig,
                 distance=Henne$distance,
                 trade=Henne$trade,
                 democracy=Henne$democracy,
                 samereligion=Henne$samereligion,
                 MID=Henne$MID,
                 hostilitydichot=Henne$hostilitydichot,
                 disputeseverity=Henne$disputeseverity,
                 majorpower=Henne$majorpower,
                 relativepower=Henne$relativepower,
                 territorialconflict=Henne$territorialconflict,
                 civildyad_narrow=Henne$civildyad_narrow,
                 civildyad_broad=Henne$civildyad_broad,
                 religioussecularmixed=Henne$religioussecularmixed,
                 religiousdyad=Henne$religiousdyad,
                 civildyad_narrow=Henne$civildyad_narrow,
                 civilsecularmixed=Henne$civilsecularmixed,
                 religiouscivilmixed=Henne$religiouscivilmixed,
                 religioussecularmixed_broad=Henne$religioussecularmixed_broad,
                 religiouspassivesecularmixed=Henne$religiouspassivesecularmixed,
                 religiousantireligiousmixed=Henne$religiousantireligiousmixed,
                 peaceyears=Henne$peaceyears,
                 sameideological=Henne$sameideological,
                 logdistance=Henne$logdistance,
                 NUMIGO=Henne$NUMIGO,
                 disputeseverityp=Henne$disputeseverityp,
                 disputeseverityp2=Henne$disputeseverityp2,
                 spline1=Henne[,91], #underscore character he uses for his variable names
                 spline2=Henne[,92], #messes R up, so index in to get them
                 spline3=Henne[,93] 
)

## Attempt to log large variable Trade to facilitate Daina's Heckman Probit code.
## It didn't actually seem to make any difference.
# Henne=na.omit(Henne)
# min(Henne$tradetotal)
# Henne$tradetotaltransformed=Henne$tradetotal + 19
# Henne$trade=log(Henne$tradetotaltransformed)

## Ordinary probit models (Henne's models 1 & 2)

#hostilitydichot religiousdyad civildyad_narrow civilsecularmixed religiouscivilmixed religiouspassivesecularmixed religiousantireligiousmixed trade democracy samereligion relativepower logdistance NUMIGO territorialconflict peaceyears if MID==1 
#Model 2
prob = glm(hostilitydichot ~ religiousdyad + civildyad_narrow +
           civilsecularmixed + religiouscivilmixed + religiouspassivesecularmixed +
           religiousantireligiousmixed + trade + democracy + samereligion +
           relativepower + logdistance + NUMIGO + territorialconflict + peaceyears,
           data=Hen, subset=MID==1, family=binomial(link="probit"))
summary(prob)


#prob MID religiousdyad civildyad_narrow civilsecularmixed religiouscivilmixed religiouspassivesecularmixed religiousantireligiousmixed trade democracy samereligion relativepower logdistance NUMIGO alliance peaceyears _spline1 _spline2 _spline3 
#Model 1
prob2 = glm(MID ~ religiousdyad + civildyad_narrow + civilsecularmixed + 
           religiouscivilmixed + religiouspassivesecularmixed + 
           religiousantireligiousmixed + trade + democracy + samereligion +
           relativepower + logdistance + NUMIGO + alliance + peaceyears +
           spline1 + spline2 + spline3,
           data=Hen, family=binomial(link="probit"))
summary(prob2)
library(apsrtable)
apsrtable(prob2,prob)
#Okay, now we just need the Heckman probit.


#Extension heckman selection model with continuous DV (dispute severity)
# First step probit, second step OLS
heck = heckit(MID ~ religiousdyad 
              + civildyad_narrow + civilsecularmixed + religiouscivilmixed + 
                religiouspassivesecularmixed + religiousantireligiousmixed + trade + democracy +
                samereligion + relativepower + logdistance + NUMIGO + alliance + peaceyears +
                spline1 + spline2 + spline3, 
              disputeseverity ~ religiousdyad + civildyad_narrow +
                civilsecularmixed + religiouscivilmixed + religiouspassivesecularmixed +
                religiousantireligiousmixed + trade + democracy + samereligion +
                relativepower + logdistance + NUMIGO + territorialconflict + peaceyears,  Hen)
coef = heck$coefficients
coef
summary(heck)


## Attempting Heckman Probit model--at this point, it doesn't work
require(VGAM)
require(mvtnorm)
require(corpcor)

## =========================================== ##
##  Heckman probit: Log-likelihood function - Code setup by Daina Chiba
## =========================================== ##
heckprob.ll <- function(pars,data){ #
  ## rho constrained b/ween -1, 1
  rho <- (exp(2*pars[length(pars)])-1)/(exp(2*pars[length(pars)])+1) 
  ## design matrix for the selection stage
  xg <- data$x.sel %*% pars[1:ncol(data$x.sel)] 
  ## design matrix for the outcome stage
  xb <- data$x.out %*% pars[(ncol(data$x.sel)+1):(ncol(data$x.sel)+ncol(data$x.out))]
  p0 <- pnorm(-xg)                         ## Pr(sel=0)
  p2 <- pmax(pnorm2(xg,  xb,  rho), 1e-10) ## Pr(sel=1 & out=1) - greater than 0 (tiny number)  - rho bounded btween -1 and 1
  p1 <- pmax(1-p0-p2, 1e-10)               ## Pr(sel=1 & out=0)
  ## Use pmax to avoid p1<0 and p1==0
  loglik <- (data$y==0)*log(p0) + (data$y==1)*log(p1) + (data$y==2)*log(p2)
  return(sum(loglik))
}

## ==================================================
##  some functions for ML implementation
## ==================================================
## wrapper function
estim.mle <- function(likfn, data, method="BFGS",r=1){ #maximizes value 
  kg <- ncol(data$x.sel) #number variables
  kb <- ncol(data$x.out) #number variables eq 2
  k <- kg + kb + r
  set.seed(123)
  out <- try(optim(par = rep(0,k),  ## initial values
                   fn = likfn,
                   data = data,
                   method = method, ## BFGS, Nelder-Mead, SANN, CG
                   control = list(fnscale = -1, maxit = 10000),
                   hessian = T), silent=T)
}
?optim
## function to display results
display.mle <- function(fit, rnd=3){
  if(class(fit)=="try-error") { return(0)}
  else {
    beta <- fit$par #beta, gamma, and rho
    vcov <- -solve(fit$hessian)
    vcov <- make.positive.definite(vcov)
    se <- sqrt(diag(vcov))
    z <- beta/se
    pval <- 2*(1-pnorm(abs(beta/se)))
    if (fit$convergence==0){
      rnd <- rnd}
    else rnd <- 0
    out <- cbind(beta = round(beta,rnd),
                 se = round(se,rnd),
                 z = round(z,rnd),
                 pval = round(pval,rnd))
    return(out)
  }
}

## function to transform atanh(rho) back to rho
get.rho <- function(x) (exp(2*x)-1)/(exp(2*x)+1)

require(foreign)

analysis.data <- Henne


## Outcome: private
## Selection: vote

## Note that we assume that we observe whether children attend private school only if 
## the family votes for increasing the property taxes. This is not true in the dataset.
## This untrue assumption is made only to illustrate the use of the command and to compare
## R result with Stata result. 

#MID ~ religiousdyad 
#+ civildyad_narrow + civilsecularmixed + religiouscivilmixed + 
 # religiouspassivesecularmixed + religiousantireligiousmixed + trade + democracy +
  #samereligion + relativepower + logdistance + NUMIGO + alliance + peaceyears +
  #spline1 + spline2 + spline3, 
#hostilitydichot ~ religiousdyad + civildyad_narrow +
 # civilsecularmixed + religiouscivilmixed + religiouspassivesecularmixed +
  #religiousantireligiousmixed + trade + democracy + samereligion +
  #relativepower + logdistance + NUMIGO + territorialconflict + peaceyears

eqn.sel <- as.formula(MID ~ religiousdyad 
                      + civildyad_narrow + civilsecularmixed + religiouscivilmixed + 
                        religiouspassivesecularmixed + religiousantireligiousmixed) ## Selection equation
eqn.out <- as.formula(hostilitydichot ~ religiousdyad + civildyad_narrow +
  civilsecularmixed + religiouscivilmixed + religiouspassivesecularmixed +
religiousantireligiousmixed)       ## Outcome equation

## Extract relevant variables from the data set
name.of.y.sel <- rownames(attr(terms(eqn.sel),"factors"))[1]
name.of.y.out <- rownames(attr(terms(eqn.out),"factors"))[1]
names.of.x.sel <- attr(terms(eqn.sel),"term.labels")
names.of.x.out <- attr(terms(eqn.out),"term.labels")
x.sel <- as.matrix(analysis.data[, match(names.of.x.sel,colnames(analysis.data))])
x.out <- as.matrix(analysis.data[, match(names.of.x.out,colnames(analysis.data))])
xmat.sel <- cbind(x.sel,1) ## add intercept
xmat.out <- cbind(x.out,1) ## add intercept
y.sel <- analysis.data[, match(name.of.y.sel, colnames(analysis.data))]
y.out <- analysis.data[, match(name.of.y.out, colnames(analysis.data))]
## recode Y
y <- y.sel
y[y.sel==1 & y.out==0] <- 1
y[y.sel==1 & y.out==1] <- 2

private.vote.data <- list(x.sel=xmat.sel, x.out=xmat.out,y=y) 

## Estimate the model
fit.heck.private.vote <- estim.mle(likfn=heckprob.ll, data=private.vote.data)

## Display results
tbl <- display.mle(fit.heck.private.vote)
rownames(tbl) <- c(attr(terms(eqn.sel),"term.labels"),"cons",
                   attr(terms(eqn.out),"term.labels"),"cons", "ath(rho)")
print(tbl)
get.rho(tbl[nrow(tbl),1])

#vector of coefficients (output) %*% means/mode/median of covariates - getting predicted probabilities

